Assume that we have downloaded all the files from g-drive, let’s get all except the ones we have identified with dodgy landmarks

# get list of the local data files that we prepared earlier
nested_df_of_pp_data <-  
  read_rds(here("analysis", "data", "derived-data", "nested_df_of_pp_data.rds"))

bad_specimens <- 
  read_table("exclude-these.md", col_names = 'bad_specimens')

# read these into R, and assign names to each data frame
nested_df_of_pp_data_good_ones <- 
  nested_df_of_pp_data %>% 
  filter(!specimen %in% bad_specimens) %>% 
  separate(specimen, 
           into = c('genus', 'species', str_glue('X{1:7}')), 
           by = "_")

Let’s do the Generalised Procrustes Analysis, first we do it for each taxa to see if there are any problems:

nested_df_of_pp_data_good_ones_arrays_21_landmarks <- 
nested_df_of_pp_data_good_ones %>% 
  select(species, data) %>% 
  # drop specimens that don't have exactly 21 landmarks
   mutate(n_row = map_int(data, nrow)) %>% 
  filter(n_row == 21) %>% 
  # convert data frame to array for the geomorph pkg
  mutate(data = map(data, ~arrange(.x, landmark) %>% 
                           select(-landmark))) %>% 
  mutate(data_array = map(data, simplify2array))

# convert to 3d array
L <-  nested_df_of_pp_data_good_ones_arrays_21_landmarks$data_array
names(L) <-  nested_df_of_pp_data_good_ones_arrays_21_landmarks$species

# our function to convert a list to a 3d array
list_to_3d_array <- function(the_list){
  array(unlist(the_list), 
         dim = c(nrow(the_list[[1]]), 
         ncol(the_list[[1]]), 
         length(the_list)))
}

# use this function to make a set of 3d arrays, one for each genus
gori_array <- L %>% keep(names(.) == 'gorilla') %>%  list_to_3d_array
afar_array <- L %>% keep(names(.) == 'afarensis') %>%  list_to_3d_array
nean_array <- L %>% keep(names(.) == 'neanderthalensis') %>%  list_to_3d_array
sapi_array <- L %>% keep(names(.) == 'sapiens') %>%  list_to_3d_array
nale_array <- L %>% keep(names(.) == 'naledi') %>%  list_to_3d_array

# compute GPA for each group as a quality check

library(geomorph) # wont work in rstudio.cloud

# Obtain Procrustes coordinates and inspect

# gorilla
gori_array_gpa <- gpagen(gori_array)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
summary(gori_array_gpa)
## 
## Call:
## gpagen(A = gori_array) 
## 
## 
## 
## Generalized Procrustes Analysis
## with Partial Procrustes Superimposition
## 
## 21 fixed landmarks
## 0 semilandmarks (sliders)
## 3-dimensional landmarks
## 6 GPA iterations to converge
## 
## 
## Consensus (mean) Configuration
## 
##                  X           Y            Z
##  [1,]  0.454869890 -0.06154339 -0.046803310
##  [2,]  0.162866017  0.06302082 -0.065861253
##  [3,] -0.038330559 -0.01338632 -0.058877443
##  [4,] -0.178518404 -0.09457430 -0.041464530
##  [5,] -0.215201657 -0.23994206 -0.015421874
##  [6,]  0.118112943 -0.24400863 -0.008616734
##  [7,]  0.080161222 -0.01933096  0.035230320
##  [8,]  0.210795797 -0.09780613  0.020409215
##  [9,]  0.011089663 -0.16745272  0.034795912
## [10,]  0.004484312 -0.09448510  0.038129062
## [11,]  0.131263439 -0.03392320  0.043599674
## [12,] -0.066412973  0.10158178  0.067173372
## [13,]  0.017045563  0.11230710  0.055838412
## [14,]  0.134490766  0.17718368  0.077194263
## [15,] -0.253044010 -0.03498057  0.030501565
## [16,] -0.209323513  0.06587484 -0.043277218
## [17,] -0.124375550  0.07744351 -0.031121180
## [18,] -0.085070444  0.12524097 -0.040897013
## [19,]  0.009851652  0.12626579 -0.030300255
## [20,]  0.107350900  0.21112297 -0.037210578
## [21,] -0.272105056  0.04139193  0.016979592
# afarensis
afar_array_gpa <- gpagen(afar_array)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
summary(afar_array_gpa)
## 
## Call:
## gpagen(A = afar_array) 
## 
## 
## 
## Generalized Procrustes Analysis
## with Partial Procrustes Superimposition
## 
## 21 fixed landmarks
## 0 semilandmarks (sliders)
## 3-dimensional landmarks
## 6 GPA iterations to converge
## 
## 
## Consensus (mean) Configuration
## 
##                 X            Y             Z
##  [1,] -0.27172244  0.130017162  0.0557969676
##  [2,] -0.11096212  0.097111838  0.1192077658
##  [3,]  0.06649541  0.081539718  0.0306640482
##  [4,]  0.19762442  0.036233252  0.0006708144
##  [5,]  0.25648361  0.011014213 -0.1042805070
##  [6,]  0.03098469  0.183618376 -0.1165284046
##  [7,] -0.02625814  0.038702713 -0.0395905754
##  [8,] -0.13747050  0.143690218 -0.0607993384
##  [9,]  0.08120912  0.040054338 -0.1456174446
## [10,]  0.01293139  0.006155746 -0.0883308383
## [11,] -0.10630058  0.063889792 -0.0470790832
## [12,]  0.03822376 -0.145363050 -0.0612435672
## [13,] -0.44983610 -0.220228227 -0.1315245779
## [14,] -0.18441086 -0.097721383  0.0334486789
## [15,]  0.20628722 -0.118747363 -0.0146890315
## [16,]  0.19026810 -0.061209594  0.0743335744
## [17,]  0.09153290 -0.033854126  0.0788941058
## [18,]  0.04875980 -0.022170498  0.1131480082
## [19,] -0.03035351 -0.017066223  0.1077410817
## [20,] -0.12270285 -0.008379135  0.1725986884
## [21,]  0.21921669 -0.107287768  0.0231796345
# neanderthals
nean_array_gpa <- gpagen(nean_array)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
summary(nean_array_gpa)
## 
## Call:
## gpagen(A = nean_array) 
## 
## 
## 
## Generalized Procrustes Analysis
## with Partial Procrustes Superimposition
## 
## 21 fixed landmarks
## 0 semilandmarks (sliders)
## 3-dimensional landmarks
## 5 GPA iterations to converge
## 
## 
## Consensus (mean) Configuration
## 
##                 X             Y             Z
##  [1,]  0.41721300 -0.0294344872  0.0120883524
##  [2,]  0.17135658  0.0003442522  0.0397710569
##  [3,] -0.04207355 -0.0276031385  0.0200409409
##  [4,] -0.20776205 -0.0520182901  0.0320078748
##  [5,] -0.28701344 -0.1296011090  0.0144479791
##  [6,]  0.05796978 -0.2454021716  0.0152840130
##  [7,]  0.08295134 -0.0428826098 -0.0199281000
##  [8,]  0.22634003 -0.1329279228  0.0153236378
##  [9,] -0.06161922 -0.1795884051 -0.0235011808
## [10,] -0.02689960 -0.0945399466 -0.0276557137
## [11,]  0.16081245 -0.0584909556 -0.0256346188
## [12,] -0.06865813  0.0634681899 -0.0391894644
## [13,]  0.04738283  0.0862764818 -0.0320921113
## [14,]  0.19153464  0.1230375237 -0.0491799760
## [15,] -0.28343612  0.0185534920 -0.0234900755
## [16,] -0.22723245  0.1085122242  0.0194286012
## [17,] -0.08094070  0.1070026290  0.0119897569
## [18,] -0.03050222  0.1218467640  0.0162674652
## [19,]  0.04513091  0.1166691831  0.0108150880
## [20,]  0.19429484  0.1886701233  0.0322421485
## [21,] -0.27884890  0.0581081732  0.0009643259
# sapiens
sapi_array_gpa <- gpagen(sapi_array)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |=================================================================| 100%
summary(sapi_array_gpa)
## 
## Call:
## gpagen(A = sapi_array) 
## 
## 
## 
## Generalized Procrustes Analysis
## with Partial Procrustes Superimposition
## 
## 21 fixed landmarks
## 0 semilandmarks (sliders)
## 3-dimensional landmarks
## 3 GPA iterations to converge
## 
## 
## Consensus (mean) Configuration
## 
##                 X            Y           Z
##  [1,]  0.37313614 -0.030516955 -0.04322407
##  [2,]  0.14954989  0.051437267 -0.14392901
##  [3,] -0.03102554 -0.006592835 -0.10338419
##  [4,] -0.18776368 -0.037458761 -0.10529973
##  [5,] -0.26539312 -0.154297131 -0.02411010
##  [6,]  0.06395297 -0.238397555 -0.06392663
##  [7,]  0.06445254 -0.053639678  0.04691435
##  [8,]  0.03426251 -0.166380177  0.02939327
##  [9,]  0.13862084 -0.140757416  0.01146733
## [10,] -0.02485129 -0.117304190  0.04986439
## [11,]  0.17004060 -0.076623548  0.06444496
## [12,] -0.08025562  0.024380847  0.16483078
## [13,]  0.03430747  0.072985290  0.08991625
## [14,]  0.19386231  0.124897023  0.14626719
## [15,] -0.26269038 -0.003372304  0.05372207
## [16,] -0.19952088  0.112378255 -0.04275809
## [17,] -0.08419665  0.106379852 -0.03145522
## [18,] -0.03843382  0.126838919 -0.04256578
## [19,]  0.04342505  0.125664877 -0.03689378
## [20,]  0.17376082  0.202858950 -0.05330540
## [21,] -0.26524015  0.077519269  0.03403142
# naledi
nale_array_gpa <- gpagen(nale_array)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
summary(nale_array_gpa)
## 
## Call:
## gpagen(A = nale_array) 
## 
## 
## 
## Generalized Procrustes Analysis
## with Partial Procrustes Superimposition
## 
## 21 fixed landmarks
## 0 semilandmarks (sliders)
## 3-dimensional landmarks
## 6 GPA iterations to converge
## 
## 
## Consensus (mean) Configuration
## 
##                  X             Y             Z
##  [1,]  0.341203079 -0.0797840136 -0.0222702606
##  [2,]  0.149273752 -0.0211254807  0.0429648878
##  [3,] -0.012160662  0.0382470638  0.0608582345
##  [4,] -0.140760077  0.0527706237  0.0402177562
##  [5,] -0.271240861  0.1257590696 -0.0191541035
##  [6,]  0.043977594  0.0999521165 -0.0584964019
##  [7,]  0.471835139  0.3114229851  0.0244160216
##  [8,]  0.167774607 -0.0155266639 -0.0785773424
##  [9,] -0.107158946  0.0554629390 -0.0687570426
## [10,] -0.021378858 -0.0263937276 -0.0004328671
## [11,]  0.075297050 -0.0507414680 -0.0295033370
## [12,] -0.128356537 -0.0340558114 -0.0467616488
## [13,] -0.004278572 -0.1251292691 -0.0057587571
## [14,]  0.112871562 -0.1708631715 -0.0585195677
## [15,] -0.251941288  0.0478736855 -0.0105360307
## [16,] -0.189148192  0.0195254296  0.0282906586
## [17,] -0.128333044  0.0007175733  0.0231432044
## [18,] -0.084972398 -0.0117069869  0.0505579562
## [19,]  0.050937083 -0.0747082225  0.0641497136
## [20,]  0.153546150 -0.1741718673  0.0612555045
## [21,] -0.226986582  0.0324751965  0.0029134222

Let’s have a look at the 3d plots of the GPA coords. We need to find the specimens that have points that don’t cluster tightly with the GPA and exclude those specimens from the sample.

# function to reshape the data for plotly
three_d_array_to_data_frame <- function(three_d_array, n = 21){
  
  # storage list
  the_list <- vector('list', length = dim(three_d_array)[3])
  
  # do extraction from array
  for(i in seq_len(dim(three_d_array)[3])){
    the_list[[i]] <- three_d_array[ , , i]
  }
  
  # make into data frame
  df1 <- data.frame(do.call('rbind', the_list))
  
  # label landmarks
  df1$landmark <- rep(1:n, dim(three_d_array)[3])
  
  df1
  
}
library(plotly)

# gorilla
gori_array_gpa_coords_df <- 
  three_d_array_to_data_frame(gori_array_gpa$coords )

plot_ly(gori_array_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 

Gorilla: problems with landmark 1, 5, and 6

# afarensis
afar_array_gpa_coords_df <- 
  three_d_array_to_data_frame(afar_array_gpa$coords )

plot_ly(afar_array_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 

Afarensis: problems with landmark 13

# neanderthals
nean_array_gpa_coords_df <- 
  three_d_array_to_data_frame(nean_array_gpa$coords )

plot_ly(nean_array_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 

Neanderthals: all look good!

# sapiens
sapi_array_gpa_coords_df <- 
  three_d_array_to_data_frame(sapi_array_gpa$coords )

plot_ly(sapi_array_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 

Sapiens: all look good!

# sapiens
nale_array_gpa_coords_df <- 
  three_d_array_to_data_frame(nale_array_gpa$coords )

plot_ly(nale_array_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 

Naledi: problems with lanmdmark 7

So, how about we drop these landmarks from all and try the GPA again? 1, 5, 6, 7, 13

landmarks_to_drop <- c(1, 5, 6, 7, 13)
n_after_dropped <- 21 - length(landmarks_to_drop) 

# drop these landmarks from all specimens 
L_dropped_some_landmarks <- 
  map(L, ~.x %>% 
            as.data.frame %>% 
            slice(-landmarks_to_drop))

# use this function to make a set of 3d arrays, one for each genus
gori_array_dropped <- L_dropped_some_landmarks %>% keep(names(.) == 'gorilla') %>%  list_to_3d_array
afar_array_dropped <- L_dropped_some_landmarks %>% keep(names(.) == 'afarensis') %>%  list_to_3d_array
nean_array_dropped <- L_dropped_some_landmarks %>% keep(names(.) == 'neanderthalensis') %>%  list_to_3d_array
sapi_array_dropped <- L_dropped_some_landmarks %>% keep(names(.) == 'sapiens') %>%  list_to_3d_array
nale_array_dropped <- L_dropped_some_landmarks %>% keep(names(.) == 'naledi') %>%  list_to_3d_array

# compute GPA for each group as a quality check

library(geomorph) # wont work in rstudio.cloud

# Obtain Procrustes coordinates and inspect

# gorilla
gori_array_dropped_gpa <- gpagen(gori_array_dropped)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
# afarensis
afar_array_dropped_gpa <- gpagen(afar_array_dropped)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
# neanderthals
nean_array_dropped_gpa <- gpagen(nean_array_dropped)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
# sapiens
sapi_array_dropped_gpa <- gpagen(sapi_array_dropped)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |=================================================================| 100%
# naledi
nale_array_dropped_gpa <- gpagen(nale_array_dropped)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |=================================================================| 100%

Take another look after we have removed some problem landmarks:

library(plotly)

# gorilla
gori_array_dropped_gpa_coords_df <- 
  three_d_array_to_data_frame(gori_array_dropped_gpa$coords, 
                              n = n_after_dropped)

plot_ly(gori_array_dropped_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 
# afarensis
afar_array_dropped_gpa_coords_df <- 
  three_d_array_to_data_frame(afar_array_dropped_gpa$coords, 
                              n = n_after_dropped)

plot_ly(afar_array_dropped_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 
# neanderthals
nean_array_dropped_gpa_coords_df <- 
  three_d_array_to_data_frame(nean_array_dropped_gpa$coords, 
                              n = n_after_dropped)

plot_ly(nean_array_dropped_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 
# sapiens
sapi_array_dropped_gpa_coords_df <- 
  three_d_array_to_data_frame(sapi_array_dropped_gpa$coords, 
                              n = n_after_dropped)

plot_ly(sapi_array_dropped_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 
# naledi
nale_array_dropped_gpa_coords_df <- 
  three_d_array_to_data_frame(nale_array_dropped_gpa$coords, 
                              n = n_after_dropped)

plot_ly(nale_array_dropped_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 

Now that we have done a little bit of quality control on the individual taxa, let’s do the GPA on them all combined, as one sample.

We can also drop some more points that are very far from the GPA clusters here:

additional_landmarks_to_drop <- c(4, 9, 10, 11, 12, 15)
landmarks_to_drop_second <-  unique(c(landmarks_to_drop, additional_landmarks_to_drop))
n_after_dropped_second <-  21 - length(landmarks_to_drop_second) 

# drop these landmarks from all specimens 
L_dropped_some_landmarks_second <- 
  map(L, ~.x %>% 
            as.data.frame %>% 
            slice(-landmarks_to_drop_second))
# compute GPA
all_specimens_3d_array <- list_to_3d_array(L_dropped_some_landmarks_second)
all_gpa <- gpagen(all_specimens_3d_array)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
# convert to data frame
all_gpa_gpa_coords_df <- 
  three_d_array_to_data_frame(all_gpa$coords, 
                              n = n_after_dropped_second)

# inspect 3d plot
plot_ly(all_gpa_gpa_coords_df,
        x = ~X1,
        y = ~X2,
        z = ~X3,
        text = ~paste('Landmark: ', landmark)) %>% 
   add_markers() 

Let’s try the PCA

gp <- as.factor(names(L_dropped_some_landmarks_second))

# with the shapes pkg
library(shapes)
out<-procGPA(all_specimens_3d_array)


shapes_plot <- data.frame(PC1 = out$rawscores[,1], 
                          PC2 = out$rawscores[,2], 
                          taxa = gp)

library(ggplot2)
ggplot(shapes_plot, 
       aes(x = PC1, 
           y = PC2, 
           color=taxa)) + 
  geom_point(size = 5) + 
  theme_bw()